import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Column names for the UCI "Adult" (census income) dataset, which ships
# without a header row.
Labels = ['Age','Workclass','fnlwgt','Education','EducationClass','Marital_status','Occupation','Relationship','Race','Sex',
'Capital_gain','Capital_loss','Hours_per_week','Native_country','Income_band']
# NOTE(review): hard-coded local path — update before running elsewhere.
path = "/Users/alvis/Downloads/adult.data"
# skipinitialspace strips the blank after each comma so category strings compare cleanly.
data = pd.read_csv(path, names = Labels, skipinitialspace=True)
# 'Education' is the string form of the numeric 'EducationClass' (e.g. Bachelors=13,
# HS-grad=9 in the preview below), and 'Relationship' overlaps 'Marital_status',
# so both are dropped.
adult_data = data.drop(["Education","Relationship"], axis=1)
print(adult_data.info())
# Inspect the raw category levels; '?' marks missing values in this dataset.
print(f'Unique Income_band values are: {adult_data.Income_band.unique()}\n')
print(f'Unique Workclass values are: {adult_data.Workclass.unique()}\n')
print(f'Unique Marital_status values are: {adult_data.Marital_status.unique()}\n')
print(f'Unique Occupation values are: {adult_data.Occupation.unique()}\n')
print(f'Unique Race values are: {adult_data.Race.unique()}\n')
print(f'Unique Sex values are: {adult_data.Sex.unique()}\n')
print(f'Unique Native_country values are: {adult_data.Native_country.unique()}')
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32561 entries, 0 to 32560 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 32561 non-null int64 1 Workclass 32561 non-null object 2 fnlwgt 32561 non-null int64 3 EducationClass 32561 non-null int64 4 Marital_status 32561 non-null object 5 Occupation 32561 non-null object 6 Race 32561 non-null object 7 Sex 32561 non-null object 8 Capital_gain 32561 non-null int64 9 Capital_loss 32561 non-null int64 10 Hours_per_week 32561 non-null int64 11 Native_country 32561 non-null object 12 Income_band 32561 non-null object dtypes: int64(6), object(7) memory usage: 3.2+ MB None Unique Income_band values are: ['<=50K' '>50K'] Unique Workclass values are: ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?' 'Self-emp-inc' 'Without-pay' 'Never-worked'] Unique Marital_status values are: ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent' 'Separated' 'Married-AF-spouse' 'Widowed'] Unique Occupation values are: ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty' 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving' 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?' 'Protective-serv' 'Armed-Forces' 'Priv-house-serv'] Unique Race values are: ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other'] Unique Sex values are: ['Male' 'Female'] Unique Native_country values are: ['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South' 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran' 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador' 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'El-Salvador' 'France' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru' 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece' 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands']
# Preview the first rows of the raw (pre-drop) frame.
data.head()
| Age | Workclass | fnlwgt | Education | EducationClass | Marital_status | Occupation | Relationship | Race | Sex | Capital_gain | Capital_loss | Hours_per_week | Native_country | Income_band | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
# Collapse Race to a binary White / Non-White indicator for the bias analysis.
adult_data['Race'] = ['White' if i=='White' else 'Non-White' for i in adult_data['Race']]
adult_data['Sex'] = ['Male' if j=='Male' else 'Female' for j in adult_data['Sex']] #Apparently we only have 2 sex in our dataset
# Silence pandas' SettingWithCopy warnings for the chained assignments used later.
pd.options.mode.chained_assignment = None
# Bucket native countries into broad income bands; 'Taiwan', '?' and 'South'
# could not be classified.
High_Income =['United-States', 'Germany', 'Ireland', 'Italy', 'Canada', 'Puerto-Rico', 'Japan', 'England', 'Hong', 'Portugal', 'Poland', 'Outlying-US(Guam-USVI-etc)','Trinadad&Tobago','Greece', 'France', 'Scotland', 'Hungary','Holand-Netherlands']
Upper_Middle_Income = [ 'Mexico', 'Columbia', 'Guatemala', 'Jamaica', 'Dominican-Republic', 'Cuba', 'China', 'Thailand', 'Ecuador', 'Yugoslavia', 'Peru']
Lower_Middle_Income = ['Philippines', 'Haiti', 'Nicaragua', 'India', 'Vietnam', 'Iran', 'Cambodia', 'El-Salvador', 'Honduras', 'Laos']
Not_Classified = ['Taiwan', '?', 'South']
adult_data['Native_country'] = adult_data.Native_country.replace(High_Income,'High_Inc')
adult_data['Native_country'] = adult_data.Native_country.replace(Upper_Middle_Income,'Up_Mid_Inc')
adult_data['Native_country'] = adult_data.Native_country.replace(Lower_Middle_Income,'Low_Mid_Inc')
adult_data['Native_country'] = adult_data.Native_country.replace(Not_Classified,'Not_Classified')
# Merge every no-longer-with-spouse status into a single category.
no_spouse = ['Married-spouse-absent','Separated','Widowed','Divorced']
adult_data['Marital_status'] = adult_data.Marital_status.replace(no_spouse,'Not with Spouse' )
# Distribution of the major attributes — one bar chart per variable, so the
# dominant groups and the marginalised groups are easy to spot.
subdf = adult_data
plot_columns = ['EducationClass', 'Sex', 'Native_country', 'Age',
                'Race', 'Marital_status', 'Workclass', 'Hours_per_week']
titles = ['Education', 'Sex', 'Native Country', 'Age',
          'Race', 'Marital Status', 'Workclass', 'Hours_per_week']
plt.figure(figsize=[18,14])
for position, (column, title) in enumerate(zip(plot_columns, titles), start=1):
    counts = subdf.groupby(column).size()
    plt.subplot(2, 4, position)
    plt.bar(counts.index, counts)
    plt.xticks(rotation=30)
    plt.title(title)
plt.suptitle('Fig 1: Distribution of Major Variables', size=16)
plt.show()
The above figure shows the distribution of various important attributes of the dataset, to help understand the representation of majority groups and the marginalised groups in various areas. It would help us identify and focus on those groups for biases or unfairness. The dominant groups are
- Sex - Males
- Race - Whites
- Native Country - High Income Countries
- Occupation - Private
- Marital Status - Married-to-civilian
- Education - Level 10
- Age - 35-50 years
- Hours-of-week - 40 per week
# Encoding categorical columns in numeric terms
import copy
new_adult_data = copy.copy(adult_data)
# Integer-encode each categorical column via pandas category codes.
new_adult_data['Workclass'] = new_adult_data.Workclass.astype('category').cat.codes
new_adult_data['Marital_status'] = new_adult_data.Marital_status.astype('category').cat.codes
new_adult_data['Occupation'] = new_adult_data.Occupation.astype('category').cat.codes
new_adult_data['Race'] = new_adult_data.Race.astype('category').cat.codes
new_adult_data['Sex'] = new_adult_data.Sex.astype('category').cat.codes
new_adult_data['Native_country'] = new_adult_data.Native_country.astype('category').cat.codes
adult_data_input = new_adult_data.drop(['Income_band'],axis=1)
adult_data_output = new_adult_data['Income_band']
X_train,X_test,y_train,y_test = train_test_split(adult_data_input,adult_data_output, test_size=0.33, shuffle=False) # shuffle=False keeps row order, so the test set is exactly the LAST 33% of rows; later cells (e.g. adult_data[21815:32561]) rely on this alignment. (The previous comment claimed the data was shuffled — it is not.)
- Hyperparameter Optimising Algorithm - GridSearchCV from sklearn
- Cross-Validation Algorithm - RepeatedStratifiedKFold
- Evaluation Metric - F1
- Model 1 - Decision Tree (depth=3)
- Model 2 - Decision Tree (depth=7)
- Model 3 - Logistic Regression
- Model 4 - Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
def hyperparameterOptmisation(model,X_train,y_train):
    """Grid-search hyperparameters for one estimator class.

    Parameters
    ----------
    model : estimator class (DecisionTreeClassifier, LogisticRegression
        or RandomForestClassifier) — the class itself, not an instance.
    X_train, y_train : training features and string labels
        ('<=50K' / '>50K').

    Returns the fitted GridSearchCV object and prints the best score
    and parameters.

    Raises ValueError for an estimator class with no grid defined.
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
    # scoring='f1' assumes labels {0, 1}; with this dataset's string labels
    # every fold scored nan (as seen in the original output). Score the
    # '>50K' class explicitly instead.
    f1_scorer = make_scorer(f1_score, pos_label='>50K')
    if model == DecisionTreeClassifier:
        param_grid = {'criterion': ['gini', 'entropy'],
                      'max_depth': [3, 4, 5, 6, 7],
                      'min_samples_split': [2, 3, 4, 5],
                      'min_samples_leaf': [1, 2, 3, 4]} ######## We can add class_weights here with every algorithm, if we want to make out model more fair
    elif model == LogisticRegression:
        # Only solver/penalty pairs LogisticRegression actually supports;
        # invalid combinations previously failed to fit and were scored nan.
        C_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
        param_grid = [
            {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': C_values},
            {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': C_values},
        ]
    elif model == RandomForestClassifier:
        param_grid = {'n_estimators': [10, 25, 50],
                      'max_depth': [None, 5, 10, 20]}
    else:
        raise ValueError(f'No parameter grid defined for {model}')
    search = GridSearchCV(model(), param_grid, scoring=f1_scorer, cv=cv)
    result = search.fit(X_train, y_train)
    print(f'Best Score for {model}: {result.best_score_}')
    print(f'Best Hyperparameters: {result.best_params_}')
    return result
# Run the grid search once per candidate model family.
Models = [DecisionTreeClassifier,LogisticRegression,RandomForestClassifier]
for model in Models:
    hyperparameterOptmisation(model,X_train,y_train)
Best Score for <class 'sklearn.tree._classes.DecisionTreeClassifier'>: nan
Best Hyperparameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score for <class 'sklearn.linear_model._logistic.LogisticRegression'>: nan
Best Hyperparameters: {'C': 1e-05, 'penalty': 'none', 'solver': 'newton-cg'}
Best Score for <class 'sklearn.ensemble._forest.RandomForestClassifier'>: nan
Best Hyperparameters: {'max_depth': None, 'n_estimators': 10}
def DecisionTrees(X_train,y_train,X_test,d):
    """Fit a depth-limited decision tree and predict on the test set.

    Returns a 4-tuple:
    (training accuracy, test-set predictions, feature importances, fitted model).
    """
    clf = DecisionTreeClassifier(criterion='gini', max_depth=d,
                                 min_samples_leaf=3, min_samples_split=5)
    clf.fit(X_train, y_train)
    train_accuracy = clf.score(X_train, y_train)
    y_pred = clf.predict(X_test)
    return train_accuracy, y_pred, clf.feature_importances_, clf
def LogisticRegressor(X_train,y_train,X_test):
    """Fit an L1-regularised logistic regression (liblinear, C=10).

    Returns a 4-tuple:
    (training accuracy, test-set predictions, per-feature odds ratios
    exp(coef), fitted model).
    """
    clf = LogisticRegression(penalty='l1', solver='liblinear', C=10)
    clf.fit(X_train, y_train)
    train_accuracy = clf.score(X_train, y_train)
    y_pred = clf.predict(X_test)
    odds_ratios = np.exp(clf.coef_[0])
    return train_accuracy, y_pred, odds_ratios, clf
def RandomForest(X_train,y_train,X_test):
    """Fit a random forest (50 trees, max depth 20) and predict on the test set.

    Returns a 4-tuple:
    (training accuracy, test-set predictions, feature importances, fitted model).
    """
    clf = RandomForestClassifier(n_estimators=50, max_depth=20)
    clf.fit(X_train, y_train)
    train_accuracy = clf.score(X_train, y_train)
    y_pred = clf.predict(X_test)
    return train_accuracy, y_pred, clf.feature_importances_, clf
# Fit the four candidate models; each result tuple is
# (train accuracy, test predictions, feature weights, fitted estimator).
DT3 = DecisionTrees(X_train,y_train, X_test, 3)
DT7 = DecisionTrees(X_train,y_train, X_test, 7)
LogR = LogisticRegressor(X_train,y_train,X_test)
RandF = RandomForest(X_train,y_train,X_test)
def F1Score(y_test, y_pred):
    """Compute (F1 score, recall) for the positive class.

    The positive class is the lexicographically largest label present
    (for this dataset '>50K', since it sorts after '<=50K') — the same
    ordering sklearn's confusion_matrix used in the original version.
    Counts are computed directly, so the function no longer raises
    ZeroDivisionError when the positive class is never predicted or
    never present; it returns 0.0 for the undefined quantities instead.
    """
    labels = sorted(set(y_test) | set(y_pred))
    positive = labels[-1]  # '>50K' sorts after '<=50K'
    TP = FP = FN = 0
    for actual, predicted in zip(y_test, y_pred):
        if predicted == positive:
            if actual == positive:
                TP += 1
            else:
                FP += 1
        elif actual == positive:
            FN += 1
    Precision = TP / (TP + FP) if TP + FP else 0.0
    Recall = TP / (TP + FN) if TP + FN else 0.0
    if Precision + Recall == 0:
        return 0.0, Recall
    F1score = (2 * Recall * Precision) / (Recall + Precision)
    return F1score, Recall
# Test-set F1 score per model, sorted ascending for plotting.
Model_names = ['DT3','DT7','LogR','RandF']
scores=[]
for model in [DT3,DT7,LogR,RandF]:
    scores.append(F1Score(y_test, model[1])[0])
df = pd.DataFrame(scores,index=Model_names, columns=['F1Score']).sort_values(by=['F1Score'],ascending=True)
# Interactive bar + trend chart of the F1 scores.
import plotly.graph_objects as go
fig=go.Figure()
def SetColor(y):
    """Map an F1 score to a traffic-light bar colour.

    'green' for strong scores (>= 0.7), 'orange' for middling
    (>= 0.65), 'red' otherwise. The original returned None for scores
    below 0.6, which left those bars with no explicit colour; 'red' now
    acts as the catch-all for everything below the orange band.
    """
    if y >= 0.7:
        return "green"
    if y >= 0.65:
        return "orange"
    return "red"
# Colour each bar by its score band and overlay the trend line.
fig.add_trace(go.Bar (name='F1 Absolute Score',x=df.index, y=df.F1Score, marker=dict(color = list(map(SetColor, df.F1Score)))))
fig.add_trace(go.Scatter(name='F1 Trend', x=df.index, y=df.F1Score, mode='lines'))
fig.update_layout(autosize=False,width=800,height=300,xaxis_title='Models', yaxis_title='F1 Score', title='Fig 2: F1 Score for Models', title_x=0.5)
fig.show()
# ROC Curve
from sklearn import metrics
# Binarise the string labels once ('>50K' -> 1) for the ROC computation.
y_test2 = [0 if i== '<=50K' else 1 for i in y_test] # Do not run it all over again
Model_names = ['DT3','DT7','LogR','RandF']
models = [DT3,DT7,LogR,RandF]
# One ROC subplot per model. NOTE(review): the curves are built from hard
# 0/1 predictions, not predicted probabilities, so each "curve" has a single
# operating point; using predict_proba would give a fuller ROC.
for j in range(len(models)):
    plt.subplot(2,2,j+1)
    y_pred = [0 if i=='<=50K' else 1 for i in models[j][1]]
    fpr, tpr, _ = metrics.roc_curve(y_test2, y_pred)
    auc = metrics.roc_auc_score(y_test2, y_pred)
    plt.plot(fpr,tpr,label= Model_names[j]+", auc="+str(auc))
    plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Fig 3({j+1}): Receiver operating characteristic')
plt.show()
The above ROC plots show the performance of Random Forest as the best among all models.
LIME — Local Interpretable Model-agnostic Explanations — facilitates the interpretation of a model by approximating it locally. The output of LIME provides an intuition into the inner workings of ML algorithms and the features used. When used to interpret a black-box ML model, it can help build the credibility of the results produced by that model.
# Test dataframe that contains records with under-represented groups as well
# as well-represented groups, to check bias using interpretation models.
# Each append pairs a group of interest with an income band (e.g. a Non-White
# '>50K' earner); .iloc[k] just picks the k-th matching row.
test_df=[]
test_df.append(adult_data[(adult_data.Race=='White') & (adult_data.Income_band=='<=50K')].iloc[0])
test_df.append(adult_data[(adult_data.Race=='Non-White') & (adult_data.Income_band=='>50K')].iloc[0])
test_df.append(adult_data[(adult_data.Native_country=='Low_Mid_Inc') & (adult_data.Income_band=='>50K')].iloc[0])
test_df.append(adult_data[(adult_data.Native_country=='High_Inc') & (adult_data.Income_band=='<=50K')].iloc[1])
test_df.append(adult_data[(adult_data.Sex=='Female') & (adult_data.Income_band=='>50K') ].iloc[0])
test_df.append(adult_data[(adult_data.Sex=='Male') & (adult_data.Income_band=='<=50K') ].iloc[2])
test_df.append(adult_data[(adult_data.Workclass=='Self-emp-inc') & (adult_data.Income_band=='>50K') ].iloc[0])
test_df.append(adult_data[(adult_data.Workclass=='Private') & (adult_data.Income_band=='<=50K') ].iloc[1])
test_df.append(adult_data[(adult_data.Age<=50) & (adult_data.Income_band=='<=50K')].iloc[4])
test_df.append(adult_data[(adult_data.Age>50) & (adult_data.Income_band=='>50K')].iloc[0])
test_df.append(adult_data[(adult_data.EducationClass==8) & (adult_data.Income_band=='<=50K')].iloc[5])
test_df.append(adult_data[(adult_data.EducationClass==15) & (adult_data.Income_band=='>50K')].iloc[0])
# The rows keep their original adult_data index labels (not reset on purpose).
test_df = pd.DataFrame(test_df)
# test_df = test_df.reset_index(drop=True)
test_df
| Age | Workclass | fnlwgt | EducationClass | Marital_status | Occupation | Race | Sex | Capital_gain | Capital_loss | Hours_per_week | Native_country | Income_band | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | 13 | Never-married | Adm-clerical | White | Male | 2174 | 0 | 40 | High_Inc | <=50K |
| 10 | 37 | Private | 280464 | 10 | Married-civ-spouse | Exec-managerial | Non-White | Male | 0 | 0 | 80 | High_Inc | >50K |
| 11 | 30 | State-gov | 141297 | 13 | Married-civ-spouse | Prof-specialty | Non-White | Male | 0 | 0 | 40 | Low_Mid_Inc | >50K |
| 1 | 50 | Self-emp-not-inc | 83311 | 13 | Married-civ-spouse | Exec-managerial | White | Male | 0 | 0 | 13 | High_Inc | <=50K |
| 8 | 31 | Private | 45781 | 14 | Never-married | Prof-specialty | White | Female | 14084 | 0 | 50 | High_Inc | >50K |
| 2 | 38 | Private | 215646 | 9 | Not with Spouse | Handlers-cleaners | White | Male | 0 | 0 | 40 | High_Inc | <=50K |
| 68 | 49 | Self-emp-inc | 191681 | 10 | Married-civ-spouse | Exec-managerial | White | Male | 0 | 0 | 50 | High_Inc | >50K |
| 3 | 53 | Private | 234721 | 7 | Married-civ-spouse | Handlers-cleaners | Non-White | Male | 0 | 0 | 40 | High_Inc | <=50K |
| 5 | 37 | Private | 284582 | 14 | Married-civ-spouse | Exec-managerial | White | Female | 0 | 0 | 40 | High_Inc | <=50K |
| 7 | 52 | Self-emp-not-inc | 209642 | 9 | Married-civ-spouse | Exec-managerial | White | Male | 0 | 0 | 45 | High_Inc | >50K |
| 744 | 17 | Private | 316929 | 8 | Never-married | Handlers-cleaners | White | Male | 0 | 0 | 20 | High_Inc | <=50K |
| 52 | 47 | Private | 51835 | 15 | Married-civ-spouse | Prof-specialty | White | Female | 0 | 1902 | 60 | Low_Mid_Inc | >50K |
import lime
import lime.lime_tabular
# Columns LIME should treat as categorical.
# NOTE(review): LimeTabularExplainer's categorical_features parameter expects
# column *indices*; passing names here may not behave as intended — confirm.
cat_features = ['Workclass', 'EducationClass', 'Marital_status','Occupation', 'Race', 'Sex', 'Native_country']
# LIME requires class probabilities in case of classification example
explainer = lime.lime_tabular.LimeTabularExplainer(np.array(X_train), feature_names=X_train.columns, class_names =['<=50K','>50k'],
                                                   categorical_features=cat_features,
                                                   mode='classification')
# Pick the 8th curated record and show each model's local explanation for it.
test_df_records = test_df.index
record = test_df_records[7]
print('RECORD:')
print(f'**************************************************************************************************************' )
# print(f'adult_income_dataset Record - {record} True Income : {y_test.iloc[record]}')
print(f'{test_df.loc[record]}')
print(f'**************************************************************************************************************' )
print('\n')
# NOTE(review): `record` is an index *label* taken from test_df, but
# X_test.iloc[record] below treats it as a *position* within the test split —
# the row printed above and the row explained below are therefore different
# records. Check whether .iloc should be .loc (and whether the record is even
# in the test split).
Model_names = ['DT3', 'LogR', 'RandF']
model = [DT3,LogR,RandF]
for i in range(len(model)):
    # The lambda is consumed within the same iteration, so late binding of i
    # is not an issue here.
    predict_fn = lambda x: model[i][3].predict_proba(x).astype(float)
    exp = explainer.explain_instance(X_test.iloc[record], predict_fn, num_features=10)
    print(f'Prediction and Local Interpretation for: {Model_names[i]}')
    print(f'----------------------------------------------------------------------------------------------------------------------------------\n' )
    exp.show_in_notebook()
RECORD: ************************************************************************************************************** Age 53 Workclass Private fnlwgt 234721 EducationClass 7 Marital_status Married-civ-spouse Occupation Handlers-cleaners Race Non-White Sex Male Capital_gain 0 Capital_loss 0 Hours_per_week 40 Native_country High_Inc Income_band <=50K Name: 3, dtype: object ************************************************************************************************************** Prediction and Local Interpretation for: DT3 ----------------------------------------------------------------------------------------------------------------------------------
Prediction and Local Interpretation for: LogR ----------------------------------------------------------------------------------------------------------------------------------
Prediction and Local Interpretation for: RandF ----------------------------------------------------------------------------------------------------------------------------------
Using LIME interpretations, we can see that the models classify people into the '<=50K' category even when the predicted probability is as low as 0.6–0.7.
But since we want to minimise the chance of bias in predictions, we can add a check so that the model classifies a person only when the probability is above 0.8; when it is lower, the case should be flagged by the AI model for a human to intervene.
This can be a great example of how Human-centered AI can help minimise inequality propagated by AI models.
#Global Framework
#!conda install -c conda-forge Skater -y
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
# Global surrogate feature-importance plot (Skater) for each model.
Model_names = ['DT3', 'LogR', 'RandF']
Models = [DT3,LogR,RandF]
interpreter = Interpretation(training_data=X_test, feature_names=X_test.columns)
for i in range(len(Models)):
    # Lambda is consumed inside the same iteration, so the late-binding
    # closure pitfall does not apply here.
    predict_fn = lambda x: Models[i][3].predict_proba(x).astype(float)
    global_model = InMemoryModel(predict_fn, examples=X_train, target_names=['<=50K','>50k'])
    fig,ax = interpreter.feature_importance.plot_feature_importance(global_model, progressbar=False)#plotting feature importance
    fig.suptitle(f'Fig 4: Global Surrogate Interpretation for: {Model_names[i]}',size=16 )
On manually running the LIME code on test_df records to check if there is a bias,
We divide the benchmark for income of a person in 3 ways:
- The ideal case where it depends on education, age, work-hours, workclass, capital-gain
- The Global Interpretation where the overall features that contribute to the final prediction are considered as keys variables fo prediction
- The local interpretation where the variables considered to make the prediction(assumption in a way)for each person maybe different than the global feature importance.
# Encode the target too, so it can appear in the correlation matrix.
new_adult_data['Income_band'] = new_adult_data.Income_band.astype('category').cat.codes
plt.figure(figsize=(4, 6))
heatmap = sns.heatmap(new_adult_data.corr()[['Income_band']].sort_values(by='Income_band', ascending=False), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Fig 5: Features Correlating with Income Band', fontdict={'fontsize':16}, pad=16);
correlation_data = (new_adult_data.corr()[['Income_band']])*100
# Feature importances read off the Fig 4 surrogate plots, entered by hand.
# NOTE(review): transcribed manually — these go stale whenever the models
# are re-trained; re-check against the plots.
Global_surrogate_data_DT3 = pd.DataFrame([('Age',0.1), ('Workclass',0), ('fnlwgt',0), ('EducationClass',0.32), ('Marital_status',0.45),('Occupation',0), ('Race',0), ('Sex',0), ('Capital_gain',0.21), ('Capital_loss',0),('Hours_per_week',0), ('Native_country',0)])
Global_surrogate_data_LogR = pd.DataFrame([('Age',0.135), ('Workclass',0.03), ('fnlwgt',0), ('EducationClass',0.245), ('Marital_status',0.145),('Occupation',0.02), ('Race',0.01), ('Sex',0.14), ('Capital_gain',0.15), ('Capital_loss',0.05),('Hours_per_week',0.09), ('Native_country',0.02)])
Global_surrogate_data_RandF = pd.DataFrame([('Age',0.14), ('Workclass',0.05), ('fnlwgt',0.068), ('EducationClass',0.155), ('Marital_status',0.23),('Occupation',0.085), ('Race',0.03), ('Sex',0.02), ('Capital_gain',0.09), ('Capital_loss',0.04),('Hours_per_week',0.09), ('Native_country',0)])
# Scale everything to percentages so the hand-entered importances are
# comparable with the correlation column.
Global_surrogate_data = pd.DataFrame(index=Global_surrogate_data_DT3[0])
Global_surrogate_data['DT3'] = (Global_surrogate_data_DT3[1].values)*100
Global_surrogate_data['LogR'] = (Global_surrogate_data_LogR[1].values)*100
Global_surrogate_data['RandF'] = (Global_surrogate_data_RandF[1].values)*100
Global_surrogate_data['Correlation'] = correlation_data
Global_surrogate_data = Global_surrogate_data.sort_values(by='Correlation', ascending=False)
# Stacked bars: dataset correlation vs each model's interpreted importance.
plt.figure(figsize=[18,6])
plt.bar(Global_surrogate_data.index,Global_surrogate_data.Correlation, color='r')
plt.bar(Global_surrogate_data.index,Global_surrogate_data.LogR, bottom=Global_surrogate_data.Correlation, color='b')
plt.bar(Global_surrogate_data.index,Global_surrogate_data.RandF, bottom=Global_surrogate_data.Correlation + Global_surrogate_data.LogR, color='g')
plt.bar(Global_surrogate_data.index,Global_surrogate_data.DT3, bottom=Global_surrogate_data.Correlation + Global_surrogate_data.LogR + Global_surrogate_data.RandF, color='y')
plt.title('Fig 6: Actual Correlations in Dataset vs Correlations interpreted by ML Models', size=16)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.legend(['Original Correlation','LogR','RandF','DT3'])
plt.show()
As per litertaure, Race, Sex, Native_country, marital status, socio-economic status can be considered as the most sensitive variables here, that could potentially face bias.
As we can see in the plot, highest importance has been given to factors like Education, Age, Hours-of-work and capital-gain by all the models.
But surprisingly, sensitive variables like Marital-status and Sex appear to have a considerable impact on the predictions.
Even though the original dataset does not show any correlation with Marital-status, all three models appear to have picked up a correlation with marital-status as well.
# Collect actual labels and every model's predictions side by side.
predictions = pd.DataFrame()
predictions['Actual'] = y_test
predictions['DT3'] = DT3[1]
predictions['DT7'] = DT7[1]
predictions['LogR'] = LogR[1]
predictions['RandF'] = RandF[1]
# One confusion-matrix heatmap per model (rows = actual, cols = predicted).
Model_names = ['DT3', 'DT7', 'LogR', 'RandF']
fig = plt.figure(figsize=[22,4])
for i in range(len(Model_names)):
    conf_matrix = confusion_matrix(predictions['Actual'], predictions[Model_names[i]])
    plt.subplot(1, 4, i+1)
    sns.heatmap(conf_matrix, linewidths=1, annot=True, fmt='g', xticklabels=['<=50K','>50K'],yticklabels=['<=50K','>50K'] ,cmap="Greens")
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(Model_names[i])
plt.suptitle('Fig 7: Confusion Matrix', size=16)
plt.show()
In the above confusion matrices, all of the models produce around 10% false negatives out of the total predictions. We want to examine these false negatives in detail: how many people were predicted to be in the '<=50K' band but actually earn more than 50K.
Since we want to identify bias i.e. False negatives we focus on analysing F1 score. As we have seen in part 1 , the F1 score is low for logR and DT3,DT7 which means these models are producing most number of false predictions.
# The last 33% of adult_data = the test split (valid only because the earlier
# train_test_split used shuffle=False). NOTE(review): the hard-coded 21815
# assumes 32561 rows with test_size=0.33 — recompute if the data changes.
y = adult_data[21815:32561]
Sensitive_variables = ['Sex','Race','Marital_status','Native_country']
Impacted_groups = ['Female','Non-White','Married-AF-spouse','Low_Mid_Inc']
plt.figure(figsize=[16,10])
# Prevelance of females earning >50K in actual data
for i in range(len(Sensitive_variables)):
    plt.subplot(2,2,i+1)
    # Share of the impacted group among *actual* '>50K' earners in the test split.
    y_prevelance = y[Sensitive_variables[i]][(y.Income_band=='>50K') & (y[Sensitive_variables[i]]==Impacted_groups[i])].count()
    y_prevalance_total = y.Income_band[y.Income_band=='>50K'].count()
    y_prevelance = y_prevelance/y_prevalance_total*100
    y_prevalance_total = y_prevalance_total/(y.Income_band.count())*100
    Model_names = ['DT3', 'DT7', 'LogR', 'RandF']
    temp=[]
    #temp.append(pred_prevelace_total)
    temp.append(y_prevelance)
    for model in Model_names:
        # Same share, but among the rows each model *predicted* as '>50K'.
        prev_in_pred = (y[predictions[model]=='>50K']).groupby(Sensitive_variables[i])[Sensitive_variables[i]].count() # Total no of people earning >50K in predictions
        temp.append((prev_in_pred[prev_in_pred.index==Impacted_groups[i]].values[0]/prev_in_pred.values.sum())*100) #Total females in samples to total false negatives they faced
    df = pd.DataFrame(temp,index=[Sensitive_variables[i],'DT3', 'DT7', 'LogR', 'RandF'], columns=['Prevelance'])
    sns.barplot(data=df, y=df.index, x=df.Prevelance, orient='h')
    plt.title(f'{Sensitive_variables[i]}: {Impacted_groups[i]}')
    plt.xlabel(f'fig 8({i+1}): Prevelance in >50K band')
plt.suptitle('Fig 8: Bias Evaluation: Prevelance (Dataset) vs Prevelance (ML Predictions)', size=16)
# plt.tight_layout()
plt.show()
The above plots show how the representation of groups in the training dataset is carried forward to a large extent, which in turn spreads inequality more widely in society. Potential sources of these biases could be:
In order to mitigate bias, we have to minimise the 'False Negatives' identified above.
Plot the below three situations, to show how combination of marginalised groups can cause even higher bias. Plot Prevelance in
Three graphs - for models with above 5 numbers(bars)
# Build intersectional subsets: encoded features from new_adult_data filtered
# by the human-readable labels in adult_data (both frames share a row index).
Intersection_data = copy.copy(new_adult_data)
# Non-White women
X_intersection_1 = Intersection_data[(adult_data.Race=='Non-White') & (adult_data.Sex=='Female')]
y_intersection_1 = X_intersection_1.Income_band
X_intersection_1 = X_intersection_1.drop(['Income_band'],axis=1)
# White men
X_intersection_2 = Intersection_data[(adult_data.Race=='White') & (adult_data.Sex=='Male')]
y_intersection_2 = X_intersection_2.Income_band
X_intersection_2 = X_intersection_2.drop(['Income_band'],axis=1)
# Women from lower-middle-income countries
X_intersection_3 = Intersection_data[(adult_data.Native_country=='Low_Mid_Inc') & (adult_data.Sex=='Female')]
y_intersection_3 = X_intersection_3.Income_band
X_intersection_3 = X_intersection_3.drop(['Income_band'],axis=1)
# Married (civ-spouse) people from high-income countries
X_intersection_4 = Intersection_data[(adult_data.Native_country=='High_Inc') & (adult_data.Marital_status=='Married-civ-spouse')]
y_intersection_4 = X_intersection_4.Income_band
X_intersection_4 = X_intersection_4.drop(['Income_band'],axis=1)
X = [X_intersection_1, X_intersection_2, X_intersection_3, X_intersection_4]
Y = [y_intersection_1, y_intersection_2, y_intersection_3, y_intersection_4]
# Baseline false-negative percentages on the full test split.
# NOTE(review): 1202/1139/1470/1032 are hand-copied FN counts from the Fig 7
# confusion matrices — they go stale whenever the models are re-trained.
f1=[]
l = len(predictions)
f1DT3 = round((1202/l*100),2)
f1DT7 = round((1139/l*100),2)
f1LogR = round((1470/l*100),2)
f1RandF = round((1032/l*100),2)
f1.append((f1DT3,f1DT7,f1LogR,f1RandF))
# False-negative rate (as % of each subgroup's test split) for every model on
# every intersectional subset. The original had four identical per-model
# stanzas differing only in the fitting call; they are folded into one inner
# loop. The unused `TN, FP, FN, TP` unpack (immediately overwritten, and a
# crash hazard when a subgroup yields a non-2x2 confusion matrix) is removed.
for i in range(len(X)):
    # Deterministic split, consistent with the main experiment.
    x_train,x_test,Y_train,Y_test = train_test_split(X[i],Y[i], test_size=0.33, shuffle=False)
    fitted = [
        DecisionTrees(x_train, Y_train, x_test, 3),
        DecisionTrees(x_train, Y_train, x_test, 7),
        LogisticRegressor(x_train, Y_train, x_test),
        RandomForest(x_train, Y_train, x_test),
    ]
    temp=[]
    for result in fitted:
        conf_matrix = confusion_matrix(Y_test, result[1])
        # ravel()[2] = actual positive, predicted negative = false negatives.
        FN = conf_matrix.ravel()[2]
        temp.append(round((FN/len(Y_test)*100),2))
    f1.append(temp)
f1 = np.array(f1).T
df = pd.DataFrame(f1, columns=['Original','NonWhite_Female', 'White_Male','LowIncNative_Female','HighIncNative_MarriedtoCiv'], index=['DT3','DT7','LogR','RandF'])
import plotly.graph_objects as go
# Tabulate the FN rates per model and subgroup.
fig = go.Figure(data=[go.Table(
    header=dict(values=list(['Model','Original','NonWhite-Female', 'White-Male','LowIncNative-Female','HighIncNative-MarriedtoCiv']),
                fill_color='lightblue',
                align='center'),
    cells=dict(values=[df.index,df.Original,df.NonWhite_Female, df.White_Male, df.LowIncNative_Female, df.HighIncNative_MarriedtoCiv],
               fill_color='ivory',
               align='left'))])
fig.update_layout(title_text='False Negative Rate across various intersections of groups vs Original Data', title_x=0.5)
fig.show()
# Same data as a grouped bar chart.
df.plot.bar()
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.xticks(rotation=0)
plt.title('Ratio of False Predictions')
plt.show()
# Recall per model (second element returned by F1Score), sorted ascending.
# NOTE(review): this overwrites `scores` (previously the F1 list) with recall
# values — later cells that index `scores` get recall, not F1.
Model_names = ['DT3','DT7','LogR','RandF']
scores=[]
for model in [DT3,DT7,LogR,RandF]:
    scores.append(F1Score(y_test, model[1])[1])
rf = pd.DataFrame(scores,index=Model_names, columns=['Recall']).sort_values(by=['Recall'],ascending=True)
fig=go.Figure()
def SetColor(y):
    """Map a recall value to a traffic-light bar colour.

    'green' for recall >= 0.60, 'orange' for >= 0.55, 'red' otherwise.
    The original returned None for values below 0.50, which left those
    bars with no explicit colour; 'red' is now the catch-all band.
    (Shadows the earlier F1-score SetColor on purpose — thresholds differ.)
    """
    if y >= 0.60:
        return "green"
    if y >= 0.55:
        return "orange"
    return "red"
# Bar + trend chart of recall, coloured by the SetColor bands above.
fig.add_trace(go.Bar (name='Recall',x=rf.index, y=rf.Recall, marker=dict(color = list(map(SetColor, rf.Recall)))))
fig.add_trace(go.Scatter(name='Recall Trend', x=rf.index, y=rf.Recall, mode='lines'))
fig.update_layout(autosize=False,width=800,height=300,xaxis_title='Models', yaxis_title='Recall', title='Fig 3: Recall for Models', title_x=0.5)
fig.show()
Based on the application we are dealing with here, we want to avoid false predictions against marginalised groups. Since this income-band prediction would be used to calculate a credit score — an important evaluation metric for banks when sanctioning loans — we would not want the marginalised groups to be allocated false predictions, especially false negatives (i.e. they actually earn more than 50K, but the AI system makes a biased prediction and puts them in the '<=50K' bracket).
So we select the F1 score (Fig 2 in question 1 of the CW) and recall to establish which model is most fair.
Looking at the Global Surrogate Interpretations, we select Education as our actionable variable for checking counterfactuals. As we can see in Fig 1, the distribution of education level peaks at 9. So if we upgrade people's education from levels 5, 7 and 8 to 9, we hope to see some considerable changes.
import copy
# Counterfactual for women: raise EducationClass 6/7/8 by 3 levels (up to
# 9/10/11) on female rows only, retrain, and compare F1 against the baseline.
CF_data = copy.copy(new_adult_data)  # pandas __copy__ deep-copies, so new_adult_data is untouched
# Vectorised .loc mask replaces the original row-by-row Python loop, which
# used chained indexing (`CF_data.EducationClass[i] += 3`) — slow and
# SettingWithCopy-prone.
female_mask = CF_data.EducationClass.isin([6, 7, 8]) & (CF_data.Sex == 0)
CF_data.loc[female_mask, 'EducationClass'] += 3
CF_data_output = CF_data.Income_band
CF_data_input = CF_data.drop(['Income_band'], axis=1)
# shuffle=False keeps the split deterministic so the counterfactual models
# are evaluated on exactly the same test rows as the originals.  (The
# original comment claimed shuffle was on, contradicting the code.)
CF_X_train, CF_X_test, CF_y_train, CF_y_test = train_test_split(
    CF_data_input, CF_data_output, test_size=0.33, shuffle=False)
LogR = LogisticRegressor(CF_X_train, CF_y_train, CF_X_test)
RandF = RandomForest(CF_X_train, CF_y_train, CF_X_test)
F1_LogR = F1Score(CF_y_test, LogR[1])[0]
F1_RandF = F1Score(CF_y_test, RandF[1])[0]
Original = [scores[2], scores[3]]  # baseline LogR / RandF scores from Fig 3
Counterfactual_female = np.array([Original, [F1_LogR, F1_RandF]]).T
Counterfactual_female = pd.DataFrame(
    Counterfactual_female, index=['LogR', 'RandF'],
    columns=['Original', 'Counterfactual'])  # fixed 'Orginal' typo in legend label
# Counterfactual for men: same intervention on male rows, but with a +2 boost.
# NOTE(review): the female branch used +3 (reaching level 9 from 6); +2 here
# leaves class 6 at 8 — confirm the asymmetry is intentional.
male_mask = CF_data.EducationClass.isin([6, 7, 8]) & (CF_data.Sex == 1)
CF_data.loc[male_mask, 'EducationClass'] += 2  # vectorised, replaces the chained-indexing loop
CF_data_output = CF_data.Income_band
CF_data_input = CF_data.drop(['Income_band'], axis=1)
# shuffle=False: deterministic split, comparable with the baseline models.
CF_X_train, CF_X_test, CF_y_train, CF_y_test = train_test_split(
    CF_data_input, CF_data_output, test_size=0.33, shuffle=False)
LogR = LogisticRegressor(CF_X_train, CF_y_train, CF_X_test)
RandF = RandomForest(CF_X_train, CF_y_train, CF_X_test)
F1_LogR = F1Score(CF_y_test, LogR[1])[0]
F1_RandF = F1Score(CF_y_test, RandF[1])[0]
Counterfactual_male = np.array([Original, [F1_LogR, F1_RandF]]).T
Counterfactual_male = pd.DataFrame(
    Counterfactual_male, index=['LogR', 'RandF'],
    columns=['Original', 'Counterfactual'])  # fixed 'Orginal' typo in legend label
# Plot the original-vs-counterfactual F1 comparison for each sex.
for frame, chart_title in (
    (Counterfactual_female, 'F1 Score for Female Education: Original vs CounterFactual'),
    (Counterfactual_male, 'F1 Score for Male Education: Original vs CounterFactual'),
):
    frame.plot.bar()
    plt.title(chart_title)
    plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.show()
Observations:
- The Adult Income Dataset was extracted from the US Census Database in 1994 by Barry Becker (later donated to the UCI Machine Learning Repository), to study relationships between various demographic and socioeconomic attributes along with an individual's income band. The dataset can be used to explore various correlations among these factors to understand the socioeconomic status of people and to highlight key issues like inequality, imbalances and state of development. The primary purpose of our task specifically is to predict the income band of individuals on the basis of their background information.
Attributes of the dataset:
The dataset has 32,561 instances and 15 attributes (a mix of categorical and continuous values). Each instance consists of the following information about individuals:
- Age - Individual's age in years (from 17 to 90) - (Continuous)
- WorkClass - Employment type (Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked) - (Categorical)
- Final Weight - Sampling weight calculated by US Census Bureau at the time of dataset creation. - (Continuous)
- Education - Highest level of education achieved by the individual: ['Bachelors','HS-grad','11th','Masters','9th','Some-college','Assoc-acdm','Assoc-voc','7th-8th','Doctorate','Prof-school','5th-6th','10th','1st-4th','Preschool','12th'] - (Categorical)
- Education Number - Numerical representation of Education level - (Continuous)
- Marital Status - Marital status of individual (Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse) - (Categorical)
- Occupation - Type of individuals' job (Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces - (Categorical)
- Relationship - Role of individual in their family - (Categorical)
- Race - Race of the individual (White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black) - (Categorical)
- Sex - Sex of the individual (Male, Female) - (Categorical)
- Capital Gain - Amount of capital gains reported by individual (USD)- (Continuous)
- Capital Loss - Amount of capital losses reported by individual (USD) - (Continuous)
- Hours per week - No. of hours individual works in a week - (Continuous)
- Native Country - Original Nationality of the individual - (Categorical)
- Income - Income band the individual lies in ('<=50K', '>50K') USD - (Categorical)
Error, Noise, Redundancies in Dataset:
The data has a few missing values in WorkClass, Occupation and Native Country, that appear in the form of a '?' symbol.
Missing Information in Dataset:
One thing that could be included in the dataset is the length of continuous residency of the individual, as it would give a better verification of individual's background in terms of security.
Imbalanced Dataset:
The dataset is definitely imbalanced for categorical attributes like Race, Sex, Marital-Status etc. There are uneven number of records present for different classes. This is an important aspect to be taken care of while using the data for Machine Learning purposes.
Data acquisition process:
Each instance was acquired through the national survey conducted by the US Census Bureau as part of the census that occurs once every 10 years. The information was collected through questionnaires and later digitized in the form of computer records. Each piece of information in the dataset is deterministic and has almost no level of uncertainty, unless the information provided by the individual was false/incorrect.
Consent for data collection:
It is unlikely that any consent was taken from individuals to get this data as it is mandatory for individuals to share these details to the survey board as a part of law. But by the same constitution, the survey board is bound to keep the details confidential and can not share information of any individual. The dataset that is available does not contain any explicit information of individuals, like Name, Address etc, and coantains details that are insufficient to reveal the identity of a person.
- It can have many uses in the public-policy or market research by organisations, in the field of education, occupation etc.
- It could be used by Data Scientists to extract true causality of predictions that may be masked in the dataset. These can help identify root causes of issues like social disparity, unemployment amongst youth etc.
- It can be greatly used to build data privacy and confidentiality framewoks for large scale datasets which tend to be key targets for illegal data selling agencies.
- Private companies could unethically use this information to manipulate people's choices of things, like political point of views, digital content consumption etc.
The dataset is made available in public domain by the University of California, Irvine and is free to use for non-commercial purposes. The use of data should abide by the US constitution law of confidentiality and privacy, under which it was collected.
The Adult Income Dataset is hosted and maintained by the University of California, Irvine (UCI) Machine Learning Repository. The repository is maintained by the Center for Machine Learning and Intelligent Systems at UCI. UCI is responsible for availability and integrity of the dataset, including storage, access, updates etc.
- Managing Agency Contact: UCI ML Repository can be contacted through their hosted website https://archive.ics.uci.edu/ml/index.php.
As such there is no clear mechanism to contribute to the dataset, but one way is to collect data (with consent) in one's own repository and use it.
Developer:
The model has been developed by Surbhi Goel (MSc Student) as a part of her Coursework for Accountable, Transparent and Responsible AI at the University College London, UK, dated 1st Mar, 2023.
Model Type:
The model is an Artificial Intelligence Ensemble that is based on the Random Forest Algorithm, which is also the least biased model of the four Machine Learning Algorithms compared in the coursework (two Decisions trees with depth 3 and 7, Logistic Regressor, Random Forest).
HyperParameters:
The Random Forest model has been defined with the following parameters. The values have been derived from the hyperparameter optimising algorithm used during model development.
- n_estimators=50 : It is the number of decision trees that form our Random Forest.
- max_depth=20 : It is the longest allowed path between the root node and leaf nodes.
Hyperparameter Tuning:
The model has been tuned using GridSearch Cross Validation of the sklearn library with the following specifications:
- scoring : The scoring or the performance measurement method used is F1 score.
- cv : The cross-validation for tuning is RepeatedStratifiedKFold.
Cross Validation:
The cross-validation method used in the hyperparameter optimisation is RepeatedStratifiedKFold of sklearn library with following specifications:
- n_splits=10 : It is the no. of splits that cross-validation algorithm is using within the hyperparameter optimising model.
- n_repeats=5 : It is the no. of times cross-validation needs to be repeated.
- random_state=1 : It is used to control random number generator in any algorithm that involves randomization. The seed value is a reference value for the random process.
Model Features:
The model uses social and demographic attributes in the dataset to predict the income band for each instance. So the target variable is Income and the Input variables are the rest of the attributes.
Model Training:
The model has been trained on 67% of the dataset and the remaining 33% has been used for testing.
Evaluation Metrics:
Keeping in mind the nature of the research that we are performing in the task, we choose the F1 Score as our evaluation metric. The F1 Score helps identify False Negatives in imbalanced data very well.
The model is intended to identify instances of racial, sexual or any other type of apparent bias in the dataset, which is identified through the predictions made by the ML model. The model also involves Local and Global interpretability framework that is being used to have concrete conclusions on things like feature importance, probability of individual predictions.
- Local Framework: The local framework is relevant to check for local interpretations of the model, i.e. interpretation of individual records as to why a prediction has been generated by the ML model. The LIME (Local Interpretable Model-agnostic Explanations) method has been used here.
- Global Framework: The global framework is important to understand the overall sentiment of the dataset, like which features have been given the maximum importance during predictions. The Global Surrogate framework has been used here to perform the same.
The most relevant attributes used in the model are Marital_status, Education, Age, Hours_per_week and WorkClass. Other attributes emerge as relevant occasionally but not throughout the dataset.
Predictions from the 4 AI models have been compared, and the overall fairness and performance has been measured through various methods like F1 Score, Recall, Prevalence, and False Negatives Ratio in the above tasks.
Race, Sex, Native Country and Marital Status are the sensitive variables that have been focused on, to identify instances of bias among different classes. Task 3 involves various use-cases where the fairness of the models has been tested and presented. No concrete biases on the basis of these sensitive attributes were observed.
Sensitive attributes like Race, Sex, Native Country Marital Status have been regrouped in a way such that bias (if any) is more apparent. e.g. Race is divided into 'White' and 'Non-White' because historically 'Non-Whites' in general have been a victim of bias in the society. But for detailed studies, individual classes should be studied to catch biases that are less apparent among classes.